We load the dataset and display a few columns for the first five rows
# Load the dataset and display the first rows.
import numpy as np
from IPython.display import display, HTML
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd

# Show every column when displaying dataframes.
pd.options.display.max_columns = None
# Original (restricted) data location kept for reference:
#df = pd.read_csv('S:/MHS_Pathfinder/bipolar_prediction/GOLD_&_AURUM_15_04_20.csv', low_memory=False)
df = pd.read_csv('clean_dataset.csv', low_memory=False)
display(df.head())
NB: Because lo_HDL has missing values, pandas interprets the whole column as float by default.
# What columns do we have?
print('\n'.join(sorted(df.columns.to_list())))
# How many unique patients?
print("Total patients:", df.patid.nunique())
# Distribution of the suitability flag, including missing values.
print(df['suitable'].value_counts(dropna=False))
Now we only keep patients that are suitable for the inclusion analysis (suitable==1). Those are defined as patients with >2 years of follow-up after exposure_start.
# Keep a copy of the full data, then restrict the analysis to suitable
# patients (suitable == 1: >2 years of follow-up after exposure_start).
df_old = df.copy()
df = df.loc[df.suitable==1]
print("New total patients:", df.patid.nunique())
# Count values for a few important columns
for item in ['source', 'exposure', 'suitable', 'response2_1', 'responder2', 'symptom_to_exposure', 'exposure_end']:
    print(item)
    print(df[item].value_counts(dropna=False))
    print(df[item].describe())
    print()
# Build Table 1: descriptive statistics for each exposure group.
features13 = ['age_first_exposure', 'age_first_diagnosis', 'symptom_to_exposure', 'psychosis', 'depression', 'mania', 'dominant', 'sex', 'FH_BPD', 'FH_depression', 'FH_psychosis', 'weight', 'self_harm']
# Row labels of Table 1, one per line.
features = """Total, N
Age at diagnosis, median (IQR)
Age at medication initiation, median (IQR)
Years between diagnosis and exposure, median (IQR)
Female, n (%)
First presentation mania, n (%)
First presentation depression, n (%)
Depression dominant, n (%)
Psychotic experiences, n (%)
Self-harm history, n (%)
Smoker, n (%)
Family history for bipolar disorder, n (%)
Family history for depression, n (%)
Family history for psychosis, n (%)
Overweight or obese, n (%)"""
feature_rows = features.split('\n')
Table1 = pd.DataFrame(feature_rows, columns=['Features'])
for exp, exp_df in df.groupby('exposure'):
    data_list = [str(len(exp_df))]
    # Continuous features: median (IQR).
    for feature in ['age_first_diagnosis', 'age_first_exposure', 'symptom_to_exposure']:
        Q3 = np.quantile(exp_df[feature], 0.75)
        Q1 = np.quantile(exp_df[feature], 0.25)
        # Bug fix: the original computed the median twice (np.quantile then
        # .median(), the second silently overwriting the first); keep the
        # NaN-skipping pandas median that actually took effect.
        median = exp_df[feature].median()
        IQR = Q3 - Q1
        data_list.append("{:.2f} ({:.2f})".format(median, IQR))
    # Categorical features: n (%). A few columns need a specific positive
    # definition; every other column is assumed 0/1 and summed directly.
    for feature in ['sex', 'mania', 'depression', 'dominant', 'psychosis', 'self_harm', 'smoker', 'FH_BPD', 'FH_depression', 'FH_psychosis', 'weight']:
        if feature == 'sex':
            count = (exp_df[feature] == 'female').sum()
        elif feature == 'dominant':
            count = (exp_df[feature] == 'depression').sum()
        elif feature == 'smoker':
            count = exp_df[feature].isin(['ex smoker', 'current smoker']).sum()
        elif feature == 'weight':
            count = exp_df[feature].isin(['overweight', 'obese']).sum()
        else:
            # Renamed from `sum`, which shadowed the builtin.
            count = exp_df[feature].sum()
        data_list.append("{} ({:.2f} %)".format(count, count / len(exp_df) * 100))
    Table1[exp] = data_list
display(Table1)
Some values are negative. They should not be.
# Inspect the rows carrying impossible values before fixing them.
display(df.loc[df.symptom_to_exposure < 0, 'symptom_to_exposure'])
display(df.loc[df.age_first_diagnosis < 0, ['age_first_diagnosis', 'symptom_to_exposure']])
pd.set_option('display.max_rows', 100)
# Borderline large delays, for manual review.
display(df.loc[(df.symptom_to_exposure>65) & (df.symptom_to_exposure<80)])
We fix those invalid values: negative delays are clipped to 0, and negative ages or delays of 80+ years are set to missing.
# Clean invalid values: clip negative delays to 0, null impossible ages
# and implausibly long delays.
df.loc[df.symptom_to_exposure<0, 'symptom_to_exposure'] = 0
df.loc[df.age_first_diagnosis<0, 'age_first_diagnosis'] = np.nan
df.loc[df.symptom_to_exposure>=80, 'symptom_to_exposure'] = np.nan
# Multiclass target — assumes response2_1 is in {0, 1, NaN} (TODO confirm):
# 1 -> 2 (response), NaN -> 1 (equivocal), 0 stays 0 (no response).
target_multiclass = df.response2_1.replace(1, 2).fillna(1)
df['target_multiclass'] = target_multiclass
# Binary target: on lithium AND responded at 2 years.
target_lithium2y = (df.exposure=='lithium') & (df.response2_1==1)
df['target_lithium2y'] = target_lithium2y.astype(int)
# Binary target: which drug was prescribed.
target_exposure = (df.exposure=='olanzapine')
df['target_exposure'] = target_exposure.astype(int)
# Short name -> target column mapping used throughout the notebook.
targets = {
    'resp': 'response2_1',
    'exp': 'target_exposure', # 0=lithium 1=olanzapine
    'multi': 'target_multiclass',
    'lithium2y': 'target_lithium2y'
}
for target in targets:
    print(df[targets[target]].value_counts())
    print()
We have a lot of "age" features. The ones that have few missing values are good candidates: let's keep only the ages with less than 5000 missing values.
# Missing-value count for every age_* column, ascending; keep only those
# with fewer than 5000 missing values.
age_columns = [col for col in df.columns if col.startswith('age_')]
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    age_df = df[age_columns].isna().sum().sort_values(ascending=True)
    display(age_df)
features_age = age_df[age_df < 5000].index.to_list()
We now remove features that do not make sense from a clinical perspective, possibly confusing the learning process
# Drop age features judged clinically uninformative or confusing.
features_age.remove('age_BP')
features_age.remove('age_first')
features_age.remove('age_diagnosis')
features_age.remove('age_smoke')
print('age features:', features_age)
We now select a good list of features, agnostic as well as informed
# list of agnostic features (to complement with age features)
features_agnostic = """adhd
FH_suicide
mania
psychosis
relationship
self_harm
sex
sleep
smoker
T2DM
OCD
migraine
hypothyroid
CHD
other_substance_misuse
cannabis
alcohol
depression
FH_anxiety
FH_any
FH_BPD
FH_depression
FH_LD
FH_psychosis
N_dep_b4
N_man_b4
anxiety
stress
hi_LDL
lo_HDL
weight
CKD3
symptom_to_exposure
dominant"""
# list of informed features:
features_informed = """age_first_exposure
age_first_diagnosis
symptom_to_exposure
psychosis
depression
mania
dominant
sex
FH_BPD
FH_depression
FH_psychosis
weight
self_harm
cannabis
anxiety
stress
sleep
other_substance_misuse
relationship
OCD
adhd
smoker
alcohol
FH_suicide
hi_LDL
lo_HDL
CKD3
T2DM
migraine
hypothyroid
CHD
FH_anxiety
FH_any
FH_LD"""
# Features of doubtful reliability; removed from the informed set to get
# the small '13' feature set below.
shaky_features="""cannabis
anxiety
stress
sleep
other_substance_misuse
relationship
OCD
adhd
smoker
alcohol
FH_suicide
hi_LDL
lo_HDL
CKD3
T2DM
migraine
hypothyroid
CHD
FH_anxiety
FH_any
FH_LD"""
shaky_features = [ feature.strip() for feature in shaky_features.split('\n') ]
# Feature-set dict keyed by (approximate) feature count.
features = {
    # informed
    '34': [ feature.strip() for feature in features_informed.split('\n') ],
    # agnostic
    '37' : [ feature.strip() for feature in features_agnostic.split("\n") ] + features_age
}
features.update({
    # informed minus shaky = the 13 most trusted features
    '13' : [feature for feature in features['34'] if feature not in shaky_features]
})
print(len(features['34']), 'informed features:', features['34'])
print(len(features['37']), 'agnostic features:', features['37'])
print(len(features['13']), 'important features:', features['13'])
print(len(shaky_features), 'shaky features:', shaky_features)
print("Agnostic features that are not informed features:", list(set(features['37']) - set(features['34'])))
print("Informed features that are not agnostic features:", list(set(features['34']) - set(features['37'])))
Here we prepare the X dataframe (samples with their features) and the y dataframe (target to predict, for each sample). We also fix the types of a few features to make sure the algorithms will interpret them correctly.
from sklearn import preprocessing, metrics
def prepare(features, target):
    """Return (X, y) for the given feature list and target column of `df`.

    Drops every row with a missing feature or target value, then
    label-encodes non-numeric feature columns as int64.
    """
    # We first remove the samples with N/A for any of the features or label
    df_withoutNA = df.dropna(subset=(features + [target]))
    df_features = df_withoutNA[features]
    # Now we encode all string values in the features into an int value
    # (we might use One Hot encoding later?)
    le = preprocessing.LabelEncoder()
    # Consistency fix: read the dtypes of df_features itself rather than
    # the full df (same result today, but robust if the dtypes diverge).
    to_encode = [key for key, dtype in df_features.dtypes.items() if dtype not in ['float64', 'int64']]
    new_df_features = df_features.copy()
    # fit_transform is applied column-by-column by DataFrame.apply.
    new_df_features.update(df_features[to_encode].apply(le.fit_transform))
    new_df_features[to_encode] = new_df_features[to_encode].astype(np.int64)
    # Because lo_HDL has missing values, it was interpreted by pandas with
    # floats. Now that we removed the missing values, we can interpret as int
    if 'lo_HDL' in features:
        new_df_features.lo_HDL = new_df_features.lo_HDL.astype(np.int64)
    # Now we have our X and y
    return new_df_features, df_withoutNA[target]
# Build X/y for every (feature set, target) pair. 'exposure' is kept as
# an extra column so patients can later be split by drug.
X = dict()
y = dict()
for feature in features:
    for target in targets:
        X[feature + '_' + target], y[feature + '_' + target] = prepare(features[feature] + ['exposure'], targets[target])
        if target != 'exp':
            # presumably float because of NaNs in the raw column — cast
            # labels to int for the classifiers (TODO confirm).
            y[feature + '_' + target] = y[feature + '_' + target].astype(np.int64)
        print(feature + '_' + target, len(X[feature + '_' + target]), 'rows')
from scipy.stats import pearsonr, spearmanr
import seaborn as sns
# X keeps the 'exposure' column (so we know each patient's drug);
# X_dict drops it so it is never used as a predictive feature.
X_dict = dict()
y_dict = dict()
for key in X:
    X_dict.update({
        key: X[key].drop('exposure', axis=1)
    })
    # Views by dtype: numerical (float64), categorical (int64), and
    # binary (int64 with exactly two distinct values).
    X_dict.update({
        'num_' + key: X_dict[key].loc[:, X_dict[key].dtypes == np.float64],
        'cat_' + key: X_dict[key].loc[:, X_dict[key].dtypes == np.int64],
    })
    X_dict.update({
        'bin_' + key: X_dict['cat_' + key].loc[:, X_dict['cat_' + key].nunique() == 2]
    })
    # Every view shares the same label vector.
    y_dict.update({
        key: y[key],
        'num_' + key: y[key],
        'cat_' + key: y[key],
        'bin_' + key: y[key],
    })
    # When we predict exposure or lithium2y, it doesn't make sense to predict separately exposures
    if '_exp' not in key and '_lithium2y' not in key:
        # Per-drug subsets (exposure label-encoded: 0=lithium, 1=olanzapine).
        X_dict.update({
            'lit_' + key: X[key].loc[X[key].exposure == 0].drop('exposure', axis=1),
            'ola_' + key: X[key].loc[X[key].exposure == 1].drop('exposure', axis=1),
        })
        y_dict.update({
            'lit_' + key : y[key][X[key].loc[X[key].exposure == 0].index],
            'ola_' + key : y[key][X[key].loc[X[key].exposure == 1].index],
        })
print(X_dict.keys())
print(y_dict.keys())
display(X_dict['cat_13_lithium2y'])
%matplotlib inline
import seaborn as sns
sns.set_theme(style="white")
sns.set(font_scale=2)
from matplotlib.ticker import FormatStrFormatter
values = {0: df.age_first_exposure,
1: df.age_first_diagnosis}
fig, ax = plt.subplots(ncols=2, nrows=1, sharey=True)
plt.subplots_adjust(hspace=0.6)
fig.set_size_inches(10, 6)
fig.tight_layout()
for i in values.keys():
sns.distplot(values[i], kde=False, ax=ax[i], bins=range(100))
# ax[i].set_yscale('log')
ax[i].yaxis.set_major_formatter(FormatStrFormatter('%.0f'))
ax[i].set_xlim((0,100))
display(df.age_first_diagnosis)
Now let's check the number of missing values for each feature, and look at their entropy
import scipy.stats as st
def entropy3(labels, base=None):
    """Shannon entropy of a sequence of labels.

    Parameters
    ----------
    labels : sequence
        Observed labels; frequencies are obtained with value_counts
        (scipy normalises the counts internally).
    base : float, optional
        Logarithm base; defaults to 2 (entropy in bits).
    """
    vc = pd.Series(labels).value_counts(normalize=False, sort=False)
    base = 2 if base is None else base
    # Bug fix: `base` was computed but never passed to st.entropy, which
    # silently fell back to the natural logarithm.
    return st.entropy(vc, base=base)
# Missing-value count and entropy for each agnostic feature.
nan_entropy = pd.concat([df[features['37']].isna().sum(), df[features['37']].apply(entropy3, axis=0)], axis=1)
nan_entropy.columns = ['N/A', 'Entropy']
display(nan_entropy)
display(df[features['37']])
# Scatter plot with a regression line for every pair of numerical features.
key = '34_exp'
import itertools
variables_pearson = X_dict['num_' + key].columns
for variables in itertools.combinations(variables_pearson, 2):
    plt.figure(figsize=(16,9))
    sns.regplot(x = variables[0], y = variables[1], data = X_dict['num_'+key],
                line_kws={"color": "red"})
display(X_dict['num_'+ key])
from pyitlib import discrete_random_variable as drv
from scipy.spatial import distance
def similarity(X, Y):
    """Simple matching coefficient between two binary vectors:
    fraction of positions where X and Y agree (both 1 or both 0)."""
    agreement = (X * Y) + ((1 - X) * (1 - Y))
    return (agreement / len(X)).sum()
def coeff(X_dict, y, key, feature):
    """Per-feature association measures against the target y[key].

    Pearson/Spearman for numerical features, conditional entropy for
    categorical features, and Jaccard similarity plus simple matching
    coefficient (SMC) for binary features with a non-float target.
    Displays the table sorted by |pearson_r| and returns it.
    Relies on the module-level `features` dict and `X` dict.
    """
    print(len(features[feature]), 'features')
    df_coeff = pd.DataFrame(index=features[feature])
    for feat in features[feature]:
        # Correlation coefficients only for numerical features.
        if (feat in X_dict['num_' + key].columns):
            df_coeff.loc[feat, 'pearson_r'], df_coeff.loc[feat, 'pearson_p'] = pearsonr(X[key][feat], y[key])
            df_coeff.loc[feat, 'spearman_r'], df_coeff.loc[feat, 'spearman_p'] = spearmanr(X[key][feat], y[key])
        # Conditional entropy for categorical features (incl. non-binary).
        if feat in X_dict['cat_'+ key].columns:
            df_coeff.loc[feat, 'cond_entropy'] = drv.entropy_conditional([int(x) for x in X[key][feat]], [int(x) for x in y[key]], base=np.e)
        # For binary variables the Jaccard similarity is more appropriate.
        if (feat in X_dict['bin_' + key].columns) and (y[key].dtype != np.float64):
            df_coeff.loc[feat, 'jaccard'] = 1-distance.jaccard(X[key][feat], y[key])
            df_coeff.loc[feat, 'SMC'] = similarity(X[key][feat], y[key])
    # NOTE(review): the original comment claimed rows were shown only when
    # p < 0.01, but no filter is applied — all rows are displayed, sorted
    # by |pearson_r|.
    display(df_coeff
            .reindex(df_coeff.pearson_r.abs().sort_values(ascending=False).index))
    return df_coeff
For binary features:
# Association measures for the informed ('34') set against every target.
feature = '34'
for target in targets:
    key = feature + '_' + target
    print(10*'_' + target + 10*'_')
    df_coeff = coeff(X_dict, y, key, feature)
    print()
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from collections import Counter
import os
from datetime import date
from sklearn.feature_selection import chi2
from scipy import stats
import seaborn as sns
import matplotlib.pylab as plt
from numpy import percentile
from sklearn.feature_selection import SelectKBest
def chisquare(Y):
    """Pairwise chi-square p-value matrix for the int64 columns of Y.

    Each cell (i, j) holds round(p, 3) of the chi-square test of
    independence between columns i and j; 2 flags an invalid test
    (more than 80% of expected cell counts below 5); the diagonal is 0.
    """
    X = Y.loc[:, Y.dtypes == np.int64]
    column_names = X.columns
    chisqmatrix = pd.DataFrame(X, columns=column_names, index=column_names)
    # Idiom fix: enumerate replaces the original hand-maintained
    # outercnt/innercnt counters (and the easy-to-forget manual reset).
    for i, icol in enumerate(column_names):
        for j, jcol in enumerate(column_names):
            mycrosstab = pd.crosstab(X[icol], X[jcol])
            stat, p, dof, expected = stats.chi2_contingency(mycrosstab)
            chisqmatrix.iloc[i, j] = round(p, 3)
            # Validity check: percentage of expected counts >= 5.
            cntexpected = expected[expected < 5].size
            perexpected = ((expected.size - cntexpected) / expected.size) * 100
            if perexpected < 20:
                chisqmatrix.iloc[i, j] = 2  # test assumptions violated
            if icol == jcol:
                chisqmatrix.iloc[i, j] = 0.00
    return chisqmatrix
# Correlation matrices (numerical) and chi-square p-values (categorical).
# NOTE(review): relies on `key` left over from the previous cell's loop.
df_corr_pearson = X_dict['num_'+ key].corr(method='pearson', min_periods=1)
df_corr_spearman = X_dict['num_' + key].corr(method='spearman', min_periods=1)
df_corr_chisquare = chisquare(X_dict['cat_' + key])
print(X_dict[key].dtypes)
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_theme(style="white")
sns.set(font_scale=2)
def plot_matrix_corr(df_corr, fontsize=16, xsize=9, ysize=9):
    """Annotated heatmap of a square correlation / p-value matrix."""
    palette = sns.diverging_palette(230, 20, as_cmap=True)
    plt.figure(figsize=(xsize, ysize))
    sns.heatmap(
        df_corr,
        xticklabels=df_corr.columns,
        yticklabels=df_corr.columns,
        annot=True,
        linewidths=0.5,
        cmap=palette,
        fmt='.3f',
        annot_kws={'fontsize': fontsize,
                   'fontweight': 'bold',
                   'fontfamily': 'serif'},
    )
# Render the three matrices (bigger fonts for the two numeric ones).
plot_matrix_corr(df_corr_pearson, 32, 16, 12)
plot_matrix_corr(df_corr_spearman, 32, 16, 12)
sns.set_theme(style="white")
plot_matrix_corr(df_corr_chisquare, 8, 22, 14)
About Conditional Entropy.
Here, $X$ is the response and $Y$ is the feature being considered. Two properties:
import scipy.stats as stats
def anova(X_dict, key, target):
    """One-way ANOVA of each numerical feature against the binary target.

    Merges the numerical view with the target column of the global `df`
    and returns a dataframe of (feature, F-statistic, p-value) rows.
    """
    merged = X_dict['num_' + key].merge(df[target], left_index=True, right_index=True)
    rows = []
    for feature_name in X_dict['num_' + key].keys():
        group0 = merged.loc[merged[target] == 0.0, feature_name].values
        group1 = merged.loc[merged[target] == 1.0, feature_name].values
        outcome = stats.f_oneway(group0, group1)
        rows.append([feature_name, outcome.statistic, outcome.pvalue])
    return (pd.DataFrame(rows, columns=['feature', 'F-Statistic', 'p-value']))
# ANOVA for the informed feature set against every target.
feature = '34'
for target in targets:
    key = feature + '_' + target
    print('key:', key)
    display(anova(X_dict, key, targets[target]))
import scipy.stats as stats
# Two-sample t-test per numerical feature, using at most `sample` rows
# per class (equal variances assumed).
for target in targets:
    key = feature + '_' + target
    print('key:', key)
    sample = 1000
    ttest_df = X_dict['num_' + key].merge(df[targets[target]], left_index=True, right_index=True)
    results_list = list()
    for feat in X_dict['num_' + key].keys():
        res = stats.ttest_ind(ttest_df.loc[ttest_df[targets[target]]==0.0, feat].head(sample),
                              ttest_df.loc[ttest_df[targets[target]]==1.0, feat].head(sample),
                              equal_var = True)
        results_list = results_list + [[feat, res.statistic, res.pvalue]]
    ttest_results = pd.DataFrame(results_list, columns=['feature', 'T-Statistic', 'p-value'])
    display(ttest_results)
We split by time (exposure start). The 80% first patients as training set, the 20% last patients as test set.
# Temporal split: the earliest 80% of patients (by exposure_start) form
# the training set, the latest 20% the test set.
x_train = dict()
y_train = dict()
x_test = dict()
y_test = dict()
format='%d%b%Y'
df['exposure_start'] = pd.to_datetime(df['exposure_start'], format=format)
for feature_set in X_dict:
    print(feature_set)
    X_sorted = X_dict[feature_set].merge(df['exposure_start'], left_index=True, right_index=True).sort_values(by = 'exposure_start')
    y_sorted = y_dict[feature_set][X_sorted.index]
    limit = int(len(X_sorted)*.8) # We keep 80% for the training set
    train_index = X_sorted.index[:limit]
    test_index = X_sorted.index[limit:]
    # Select from the originals via index membership so the added
    # exposure_start column is not carried into the splits.
    x_train[feature_set] = X_dict[feature_set][X_dict[feature_set].index.isin(train_index)]
    y_train[feature_set] = y_dict[feature_set][y_dict[feature_set].index.isin(train_index)]
    x_test[feature_set] = X_dict[feature_set][X_dict[feature_set].index.isin(test_index)]
    y_test[feature_set] = y_dict[feature_set][y_dict[feature_set].index.isin(test_index)]
from sklearn.metrics import roc_auc_score
def results(clf, X1, y, v):
    """Evaluate `clf` on (X1, y): print balanced accuracy and ROC-AUC,
    plot the confusion matrix, and return the classification-report dict
    with an extra 'roc_auc' entry (None when it cannot be computed).

    `v` is the feature-set/target key; its suffix selects the class
    names and the multi-class handling of roc_auc_score.
    """
    X = X1
    if isinstance(X1, pd.Series):
        X = X1.to_frame()
    y_pred = clf.predict(X)
    y_score = clf.predict_proba(X)[:, 1]
    target_names = ['No Response', 'Response']
    multiclass = 'raise'
    if '_multi' in v:
        # Multi-class: keep the full probability matrix, one-vs-rest AUC.
        y_score = clf.predict_proba(X)
        multiclass = 'ovr'
        target_names = ['No Response', 'Equivocal', 'Response']
    if '_exp' in v:
        target_names = ['Lithium', 'Olanzapine']
    if '_lithium2y' in v:
        target_names = ['other', 'lithium2y']
    #print(classification_report(y, y_pred, target_names=target_names))
    result = classification_report(y, y_pred, target_names=target_names, output_dict=True)
    # Compute confusion matrix
    cm = confusion_matrix(y, y_pred)
    #print(cm)
    # Macro-averaged recall equals balanced accuracy.
    print('Balanced Accuracy:', result['macro avg']['recall'])
    # For average and multi_class parameters, check doc:
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score
    try:
        roc_auc_score_val = roc_auc_score(y, y_score, average='weighted', multi_class=multiclass)
    except ValueError:
        # Narrowed from a bare `except:`. 34_multi fails with Naive Bayes
        # because MixedNB predict_proba() seems to return proba values
        # that don't sum up to 1, which makes roc_auc_score raise.
        roc_auc_score_val = None
    result['roc_auc'] = roc_auc_score_val
    print('ROC_AUC score:', roc_auc_score_val)
    # Show confusion matrix in a separate window.
    # plt.cm.get_cmap() is deprecated (removed in matplotlib 3.9); the
    # colormap attribute access is stable across versions.
    color_map = plt.cm.Blues
    plt.matshow(cm, cmap=color_map)
    plt.title('Confusion matrix\n')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    return(result)
import warnings
def evaluate(clf, x_test, y_test):
    """Run `results` for every fitted model in `clf` and collect the
    headline metrics into one dataframe (one row per model).

    Keys in `clf` may carry a metric suffix (e.g. '_balanced_accuracy')
    that is stripped to find the matching test set.
    """
    columns = ['features',
               'balanced accuracy',
               'accuracy',
               'roc_auc',
               'f1 (response)',
               'f1 (equivocal)',
               'f1 (no response)',
               'f1 (lithium)',
               'f1 (olanzapine)',
               'f1 (lithium > 2y)',
               'f1 (other)',
               'f1 (weighted avg)']
    rows = []
    for v in clf:
        print(40*'_' + v + 40*'_')
        v2 = v.replace('_balanced_accuracy', '').replace('_accuracy', '').replace('_f1_weighted', '')
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            result = results(clf[v], x_test[v2], y_test[v2], v)
        result_dict = {'features': v,
                       'accuracy': result['accuracy'],
                       'roc_auc': result['roc_auc'],
                       'balanced accuracy': result['macro avg']['recall'],
                       'f1 (weighted avg)': result['weighted avg']['f1-score']
                       }
        # Per-class f1 columns depend on the target encoded in the key.
        if '_exp' in v:
            result_dict.update({
                'f1 (lithium)': result['Lithium']['f1-score'],
                'f1 (olanzapine)': result['Olanzapine']['f1-score'],
            })
        elif '_multi' in v:
            result_dict.update({
                'f1 (no response)': result['No Response']['f1-score'],
                'f1 (equivocal)': result['Equivocal']['f1-score'],
                'f1 (response)': result['Response']['f1-score']
            })
        elif '_lithium2y' in v:
            result_dict.update({
                'f1 (other)': result['other']['f1-score'],
                'f1 (lithium > 2y)': result['lithium2y']['f1-score']
            })
        else:
            result_dict.update({
                'f1 (no response)': result['No Response']['f1-score'],
                'f1 (response)': result['Response']['f1-score']
            })
        # Bug fix: DataFrame.append() was removed in pandas 2.0 — collect
        # rows in a list and build the frame once at the end instead.
        rows.append(result_dict)
        print(83*'_')
        print('\n\n\n')
    return pd.DataFrame(rows, columns=columns)
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegressionCV
@ignore_warnings(category=ConvergenceWarning) # the default max_iter can trigger this warning
def run(X1, y, l1, l2):
    """Fit an elastic-net logistic regression with 5-fold CV over the
    two candidate l1 ratios, scored by balanced accuracy.

    A Series input is widened to a single-column frame.
    """
    X = X1.to_frame() if isinstance(X1, pd.Series) else X1
    model = LogisticRegressionCV(
        penalty='elasticnet',
        l1_ratios=[l1, l2],
        cv=5,
        solver='saga',
        scoring='balanced_accuracy',
        n_jobs=4,
    )
    return model.fit(X, y)
# Fit the logistic models; wide feature sets lean towards l1, narrow
# ones towards l2.
clf_multi_dict = dict()
# Elastic net regularisation for larger feature sets
l1 = .8
l2 = .2
for v in [f for f in X_dict if len(X_dict[f].columns) > 5]:
    print(v)
    clf_multi_dict[v] = run(x_train[v], y_train[v], l1, l2) # a lot of features (more than 5)
for v in [f for f in X_dict if len(X_dict[f].columns) <= 5]:
    print(v)
    clf_multi_dict[v] = run(x_train[v], y_train[v], l2, l1) # NOT a lot of features (5 or less)
all_results = evaluate(clf_multi_dict, x_test, y_test)
display(all_results)
import shap
def shap_plot(clf, X, sample=50):
    """SHAP summary plot for `clf` on a sample of `sample` rows of X.

    Hoisted the repeated shap.sample(X, sample) calls into one — with
    shap.sample's default random_state the three calls returned the same
    rows anyway, so this is pure de-duplication, and it makes explicit
    that the explained rows and the plotted feature values must match.
    """
    X_sample = shap.sample(X, sample)
    explainer = shap.KernelExplainer(clf.predict, X_sample)
    shap_values = explainer.shap_values(X_sample, l1_reg="num_features(" + str(X.shape[1]) + ")")
    shap.summary_plot(shap_values, X_sample, feature_names=X.columns)
shap_plot(clf_multi_dict['13_lithium2y'], X_dict['13_lithium2y'], 100)
Inspired by: https://github.com/lmcinnes/umap/issues/58 More precisely: https://github.com/lmcinnes/umap/issues/58#issuecomment-419682509
def umap_embedding(X_dict, n_neighbors=15, weight=0.5):
    """Joint UMAP embedding of numerical and binary feature views.

    Fits one UMAP per view (Bray-Curtis metric for numerical features,
    Jaccard for binary ones), intersects the two fuzzy simplicial sets
    with the given weight, and re-embeds the intersection.
    NOTE(review): expects keys 'num' and 'bin' in X_dict, unlike the
    '<type>_<set>_<target>' keys built earlier — confirm the caller.
    """
    import umap.umap_ as umap
    fit1 = umap.UMAP(n_neighbors=n_neighbors, metric='braycurtis', random_state=42).fit(X_dict['num'].values)
    fit2 = umap.UMAP(n_neighbors=n_neighbors, metric='jaccard', random_state=42).fit(X_dict['bin'].values)
    intersection = umap.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=weight)
    intersection = umap.reset_local_connectivity(intersection)
    # Re-embed using fit1's hyper-parameters (200 epochs, random init).
    embedding = umap.simplicial_set_embedding(fit1._raw_data, intersection, fit1.n_components,
                                              fit1._initial_alpha, fit1._a, fit1._b,
                                              fit1.repulsion_strength, fit1.negative_sample_rate,
                                              200, 'random', np.random, fit1.metric,
                                              fit1._metric_kwds, False,
                                              densmap_kwds={}, output_dens=False)
    return embedding
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
sns.set(style='white', context='poster', rc={'figure.figsize':(14,10)})
def plot_umap(X_dict, n_neighbors, weight, min_dist=0.1, n_components=2):
    """2D/3D scatter of the joint UMAP embedding, coloured by exposure,
    saved under umap_images/.

    NOTE(review): colours come from the global `X.exposure`, but `X` is a
    dict at this point in the notebook — this function probably predates
    the dict refactor; confirm before use (plot_umap_2y below is the
    working variant).
    """
    import umap.umap_ as umap
    import matplotlib.pyplot as plt
    print('n_neighbors={:.0f} min_dist={:.1f} weight={:.1f} {:.0f}D'.format(n_neighbors, min_dist, weight, n_components))
    fit1 = umap.UMAP(n_neighbors=n_neighbors, metric='braycurtis', random_state=42, min_dist=min_dist, n_components=n_components).fit(X_dict['num'].values)
    fit2 = umap.UMAP(n_neighbors=n_neighbors, metric='jaccard', random_state=42, min_dist=min_dist, n_components=n_components).fit(X_dict['bin'].values)
    intersection = umap.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=weight)
    intersection = umap.reset_local_connectivity(intersection)
    embedding = umap.simplicial_set_embedding(fit1._raw_data, intersection, fit1.n_components,
                                              fit1._initial_alpha, fit1._a, fit1._b,
                                              fit1.repulsion_strength, fit1.negative_sample_rate,
                                              200, 'random', np.random, fit1.metric,
                                              fit1._metric_kwds, False,
                                              densmap_kwds={}, output_dens=False
                                              )
    plt.clf()
    fig = plt.figure()
    if n_components == 3:
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(
            embedding[0][:, 0],
            embedding[0][:, 1],
            embedding[0][:, 2],
            c=[sns.color_palette()[x] for x in X.exposure])
    else:
        ax = fig.add_subplot(111)
        ax.scatter(
            embedding[0][:, 0],
            embedding[0][:, 1],
            c=[sns.color_palette()[x] for x in X.exposure])
    #plt.gca().set_aspect('equal', 'datalim')
    plt.title('n_neighbors={:.0f} min_dist={:.1f} weight={:.1f} {:.0f}D'.format(n_neighbors, min_dist, weight, n_components), fontsize=16)
    plt.savefig('umap_images/n' + str(n_neighbors) + '_md{:.1f}_w{:.1f}_'.format(min_dist, weight) + str(n_components) + 'd.png')
def plot_umap_2y(X_dict, n_neighbors, weight, min_dist=0.1, n_components=2):
    """Joint UMAP embedding of the 13_lithium2y views (numerical +
    categorical), scattered in 2D or 3D and coloured by the lithium2y
    label; saves the figure under umap_images/ and returns the embedding.
    """
    import umap.umap_ as umap
    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatches
    print('n_neighbors={:.0f} min_dist={:.1f} weight={:.1f} {:.0f}D'.format(n_neighbors, min_dist, weight, n_components))
    fit1 = umap.UMAP(n_neighbors=n_neighbors, metric='braycurtis', random_state=42, min_dist=min_dist, n_components=n_components).fit(X_dict['num_13_lithium2y'].values)
    fit2 = umap.UMAP(n_neighbors=n_neighbors, metric='jaccard', random_state=42, min_dist=min_dist, n_components=n_components).fit(X_dict['cat_13_lithium2y'].values)
    # Blend the two views' fuzzy simplicial sets, then re-embed.
    intersection = umap.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=weight)
    intersection = umap.reset_local_connectivity(intersection)
    embedding = umap.simplicial_set_embedding(fit1._raw_data, intersection, fit1.n_components,
                                              fit1._initial_alpha, fit1._a, fit1._b,
                                              fit1.repulsion_strength, fit1.negative_sample_rate,
                                              200, 'random', np.random, fit1.metric,
                                              fit1._metric_kwds, False,
                                              densmap_kwds={}, output_dens=False
                                              )
    plt.clf()
    fig = plt.figure(figsize=(36,18))
    if n_components == 3:
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(embedding[0][:, 0],
                   embedding[0][:, 1],
                   embedding[0][:, 2],
                   c=[sns.color_palette()[col] for col in y_dict['13_lithium2y'].astype(int)])
        # Manual legend mapping colours to classes.
        classes = ['Lithium for more than 2 years', 'Other']
        class_colours = sns.color_palette()
        recs = []
        for i in range(0, len(class_colours)):
            recs.append(mpatches.Rectangle((0, 0), 1, 1, fc=class_colours[i]))
        ax.legend(recs, classes, loc=1)
    else:
        ax = fig.add_subplot(111)
        ax.scatter(
            embedding[0][:, 0],
            embedding[0][:, 1],
            c=[sns.color_palette()[x] for x in y_dict['13_lithium2y'].astype(int)])
    #plt.gca().set_aspect('equal', 'datalim')
    plt.title('n_neighbors={:.0f} min_dist={:.1f} weight={:.1f} {:.0f}D'.format(n_neighbors, min_dist, weight, n_components), fontsize=16)
    plt.savefig('umap_images/n' + str(n_neighbors) + '_md{:.1f}_w{:.1f}_'.format(min_dist, weight) + str(n_components) + 'd_2y.png')
    return embedding
# One example embedding, then a grid search over UMAP hyper-parameters
# (each call saves a PNG under umap_images/).
embedding = plot_umap_2y(X_dict, 50, 0, 0.1, 3)
for n_neighbors in range(50, 250, 20):
    for weight in np.linspace(0, 1, 5):
        for n_components in range(2,4):
            for min_dist in np.linspace(.1, .9, 4):
                plot_umap_2y(X_dict, n_neighbors, weight, min_dist=min_dist, n_components=n_components)
print('done')
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# PCA scree plot and component loadings.
# NOTE(review): 'all34' is not one of the keys built earlier (they look
# like '34_resp'), and `y` here is the dict of label vectors, not a
# single vector — this cell likely predates a refactor; confirm the
# intended key before running.
pca = PCA(n_components=9)
pipe = Pipeline([('pca', pca), ('logistic', clf_multi_dict['all34'])])
pipe.fit(X_dict['all34'], y)
#predictions = pipe.predict(X)
plt.figure(1, figsize=(8, 6))
plt.clf()
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n components')
plt.ylabel('explained variance')
pd.set_option('display.float_format', lambda x: '%.3f' % x)
display(pd.DataFrame(pca.components_,columns=X_dict['all34'].columns))
from sklearn.ensemble import RandomForestClassifier
@ignore_warnings(category=ConvergenceWarning) # max_iter default value (=100?) triggers this warning
def run(X1, y):
    """Fit a shallow random forest (a Series input is widened to a frame).

    NOTE(review): redefines the earlier logistic-regression `run`.
    """
    X = X1
    if isinstance(X1, pd.Series):
        X = X1.to_frame()
    return RandomForestClassifier(max_depth=2, random_state=0).fit(X, y)
clf2_multi_dict = dict()
for v in X_dict:
    print(v)
    # NOTE(review): trained on the FULL X_dict (train + test rows) but
    # evaluated on x_test below, so the test scores are optimistic —
    # confirm whether x_train was intended here.
    clf2_multi_dict.update({v: run(X_dict[v], y_dict[v])})
all_results2 = evaluate(clf2_multi_dict, x_test, y_test)
display(all_results2)
# Horizontal bar chart of random-forest feature importances per model.
for v in clf2_multi_dict.keys():
    importances = clf2_multi_dict[v].feature_importances_
    indices = np.argsort(importances)
    # Bug fix: the original assigned to `features`, clobbering the global
    # feature-set dict defined earlier in the notebook; use a local name.
    feature_names = X_dict[v].columns
    plt.figure(1, figsize=(8, 10))
    plt.clf()
    plt.title('Feature Importances ' + v)
    plt.barh(range(len(indices)), importances[indices], color='b', align='center')
    plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
    plt.xlabel('Relative Importance')
    plt.show()
import shap
# SHAP feature-importance bar plot for each random forest.
for v in clf2_multi_dict.keys():
    print(40*'_' + v + 40*'_')
    shap_values = shap.TreeExplainer(clf2_multi_dict[v]).shap_values(X_dict[v])
    shap.summary_plot(shap_values, X_dict[v], plot_type="bar")
    print(83*'_')
    print('\n\n\n')
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from mixed_naive_bayes import MixedNB
# Naive Bayes: Gaussian on the numerical view, Bernoulli on the binary
# view, MixedNB on the full features (binary columns flagged categorical).
clf3_dict = dict()
for v in [feats for feats in X_dict if ('_13' not in feats) and ('_3' not in feats)]:
    print(v)
    gnb = GaussianNB()
    v_num = 'num_' + v
    clf3_dict[v_num] = gnb.fit(X_dict[v_num], y_dict[v_num])
    bnb = BernoulliNB()
    v_bin = 'bin_' + v
    clf3_dict[v_bin] = bnb.fit(X_dict[v_bin], y_dict[v_bin])
    # Positional indices of the binary columns within the full frame.
    bin_features = [X_dict[v].columns.get_loc(c) for c in list(X_dict['bin_' + v].columns)]
    mnb = MixedNB(categorical_features=bin_features)
    clf3_dict[v] = mnb.fit(np.array(X_dict[v]), np.array(y_dict[v]))
all_results3 = evaluate(clf3_dict, x_test, y_test)
roc_auc_score returns an error for 34_multi. Looks like predict_proba() for MixedNB returns probas that don't sum up to 1. Strange.
# Reproduce the roc_auc_score failure for 34_multi: MixedNB probabilities
# do not sum to 1, so multi_class='ovr' raises.
v = '34_multi'
v2 = v
y_score = clf3_dict[v].predict_proba(x_test[v2])
print(y_score[0])
roc_auc_score(y_test[v2], y_score, average='weighted', multi_class='ovr')
#result = results(clf3_dict[v], x_test[v2], y_test[v2], v)
display(all_results3)
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from hyperopt.pyll import scope
import time
# Hyperopt search space for LightGBM.
param_hyperopt= {
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1)),
    'max_depth': scope.int(hp.quniform('max_depth', 5, 15, 1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 5, 35, 1)),
    'num_leaves': scope.int(hp.quniform('num_leaves', 5, 50, 1)),
    'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart']),
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
}
def hyperopt_lgbm(param_space, X_train, y_train, X_test, y_test, num_eval, metric):
    """Tune an LGBMClassifier with hyperopt TPE (5-fold CV on `metric`),
    refit the best configuration on the training set, report scores.

    Returns (best_model, best_param_dict).
    """
    start = time.time()
    def objective_function(params):
        # Hyperopt minimises, so return the negative CV score.
        clf = lgb.LGBMClassifier(**params)
        score = cross_val_score(clf, X_train, y_train, cv=5, scoring=metric).mean()
        return {'loss': -score, 'status': STATUS_OK}
    trials = Trials()
    best_param = fmin(objective_function,
                      param_space,
                      algo=tpe.suggest,
                      max_evals=num_eval,
                      trials=trials)
    loss = [x['result']['loss'] for x in trials.trials]
    # space_eval decodes hyperopt's raw indices into real parameter values.
    best_param_dict = space_eval(param_space, best_param)
    clf_best = lgb.LGBMClassifier(**best_param_dict)
    clf_best.fit(X_train, y_train)
    print("")
    print("##### Results")
    print("Score best parameters: ", min(loss)*-1)
    print("Best parameters: ", best_param_dict)
    print("Test Score: ", clf_best.score(X_test, y_test))
    print("Time elapsed: ", time.time() - start)
    print("Parameter combinations evaluated: ", num_eval)
    return clf_best, best_param_dict
# Run the LightGBM search for the 13-feature lithium2y task.
results_hyperopt_lgbm = dict()
best_params_dict_lgbm = dict()
for target in ['lithium2y']:
    for feature_number in ['13']:
        feature_set = feature_number + '_' + target
        print('\n' + 10*'_' + feature_set + 10*'_')
        for metric in ['balanced_accuracy']:
            print('\n\t----> metric:', metric)
            results_hyperopt_lgbm[feature_set + '_' + metric], best_params_dict_lgbm[feature_set + '_' + metric] = hyperopt_lgbm(param_hyperopt,
                                                                                                                                x_train[feature_set],
                                                                                                                                y_train[feature_set],
                                                                                                                                x_test[feature_set],
                                                                                                                                y_test[feature_set],
                                                                                                                                1000, metric)
all_results_hyper_lgbm = evaluate(results_hyperopt_lgbm, x_test, y_test)
display(all_results_hyper_lgbm)
from sklearn.model_selection import cross_val_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from hyperopt.pyll import scope
import time
# Hyperopt search space for the random forest.
param_hyperopt= {
    'max_depth': scope.int(hp.quniform('max_depth', 5, 15, 1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 5, 35, 1)),
    'min_samples_split':hp.uniform('min_samples_split',0,1),
    'min_samples_leaf':hp.randint('min_samples_leaf',1,10),
    'criterion':hp.choice('criterion', ['gini','entropy']),
    'max_features':hp.choice('max_features',['sqrt', 'log2'])
}
def hyperopt_rf(param_space, X_train, y_train, X_test, y_test, num_eval, metric):
    """Tune a RandomForestClassifier with hyperopt TPE (5-fold CV on
    `metric`), refit the best configuration, report scores.

    Returns (best_model, best_param_dict).
    """
    start = time.time()
    def objective_function(params):
        clf = RandomForestClassifier(**params, random_state=42)
        # Bug fix: the original called clf.fit() here before
        # cross_val_score, which clones and refits internally — the
        # extra fit was pure wasted work.
        score = cross_val_score(clf, X_train, y_train, cv=5, scoring=metric).mean()
        return {'loss': -score, 'status': STATUS_OK}
    trials = Trials()
    best_param = fmin(objective_function,
                      param_space,
                      algo=tpe.suggest,
                      max_evals=num_eval,
                      trials=trials)
    best_param_dict = space_eval(param_space, best_param)
    loss = [x['result']['loss'] for x in trials.trials]
    # Consistency fix: use the same random_state as the objective so the
    # refitted model matches the configuration that was evaluated.
    clf_best = RandomForestClassifier(**best_param_dict, random_state=42)
    clf_best.fit(X_train, y_train)
    print("##### Results")
    print("Score best parameters: ", min(loss)*-1)
    # Report the decoded dict (best_param holds raw hyperopt indices).
    print("Best parameters: ", best_param_dict)
    print("Test Score: ", clf_best.score(X_test, y_test))
    print("Time elapsed: ", time.time() - start)
    print("Parameter combinations evaluated: ", num_eval)
    #results(clf_best, X_test, y_test)
    return clf_best, best_param_dict
# Run the random-forest search for the 13-feature lithium2y task.
results_hyperopt = dict()
best_params_dict = dict()
for target in ['lithium2y']:
    for feature_number in ['13']:
        feature_set = feature_number + '_' + target
        print('\n' + 10*'_' + feature_set + 10*'_')
        for metric in ['balanced_accuracy']:
            print('\n\t----> metric:', metric)
            results_hyperopt[feature_set + '_' + metric], best_params_dict[feature_set + '_' + metric] = hyperopt_rf(param_hyperopt,
                                                                                                                     x_train[feature_set],
                                                                                                                     y_train[feature_set],
                                                                                                                     x_test[feature_set],
                                                                                                                     y_test[feature_set],
                                                                                                                     1000, metric)
all_results_hyper_rf = evaluate(results_hyperopt, x_test, y_test)
display(all_results_hyper_rf)
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from umap import UMAP
# Classification with a linear SVM
svc = LinearSVC(dual=False, random_state=123)
params_grid = {"C": [10 ** k for k in range(-3, 4)]}
clf = GridSearchCV(svc, params_grid, scoring='balanced_accuracy')
feature_set = '13_lithium2y'
clf.fit(x_train[feature_set], y_train[feature_set])
print(
    "Balanced accuracy on the test set with raw data: {:.3f}".format(clf.score(x_test[feature_set], y_test[feature_set]))
)
# Transformation with UMAP followed by classification with a linear SVM
umap = UMAP(random_state=456)
pipeline = Pipeline([("umap", umap), ("svc", svc)])
params_grid_pipeline = {
    "umap__n_neighbors": [5, 20],
    "umap__n_components": [15, 25, 50],
    "svc__C": [10 ** k for k in range(-3, 4)],
}
clf_pipeline = GridSearchCV(pipeline, params_grid_pipeline, scoring='balanced_accuracy')
clf_pipeline.fit(x_train[feature_set], y_train[feature_set])
print(
    "Balanced accuracy on the test set with UMAP transformation: {:.3f}".format(
        clf_pipeline.score(x_test[feature_set], y_test[feature_set])
    )
)
# Side-by-side summary with more decimals.
print("Balanced accuracy on the test set with raw data: {:.6f}".format(clf.score(x_test[feature_set], y_test[feature_set])),
      "\nBalanced accuracy on the test set with UMAP transformation: {:.6f}".format(clf_pipeline.score(x_test[feature_set], y_test[feature_set])))